{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes - Trabalho\n",
    "\n",
    "## Questão 1\n",
    "\n",
    "Implemente um classifacor Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente a qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:\n",
    "\n",
    "** Attributos **\n",
    "1. buying: vhigh, high, med, low\n",
    "2. maint: vhigh, high, med, low\n",
    "3. doors: 2, 3, 4, 5, more\n",
    "4. persons: 2, 4, more\n",
    "5. lug_boot: small, med, big\n",
    "6. safety: low, med, high\n",
    "\n",
    "** Classes **\n",
    "1. unacc, acc, good, vgood\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    " def loadCsv(filename):\n",
    "    lines = csv.reader(open(filename, \"r\"))\n",
    "    dataset = list(lines)\n",
    "    for i in range(len(dataset)):\n",
    "        dataset[i] = [str(x) for x in dataset[i]]\n",
    "    return dataset\n",
    "\n",
    "import random\n",
    "def splitDataset(dataset, splitRatio):\n",
    "    trainSize = int(len(dataset) * splitRatio)\n",
    "    trainSet = []\n",
    "    copy = list(dataset)\n",
    "    while len(trainSet) < trainSize:\n",
    "        index = random.randrange(len(copy))\n",
    "        trainSet.append(copy.pop(index))\n",
    "    return [trainSet, copy]\n",
    "\n",
    "def separateByClass(dataset):\n",
    "    separated = {}\n",
    "    for i in range(len(dataset)):\n",
    "        vector = dataset[i]\n",
    "        if (vector[-1] not in separated):\n",
    "            separated[vector[-1]] = []\n",
    "        separated[vector[-1]].append(vector)\n",
    "    return separated\n",
    "\n",
    "\n",
    "def mean(numbers):\n",
    "    return sum(numbers)/float(len(numbers))\n",
    " \n",
    "def stdev(numbers):\n",
    "    avg = mean(numbers)\n",
    "    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)\n",
    "    return math.sqrt(variance)\n",
    "\n",
    "def summarize(dataset):\n",
    "    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]\n",
    "    del summaries[-1]\n",
    "    return summaries\n",
    "\n",
    "def summarizeByClass(dataset):\n",
    "    separated = separateByClass(dataset)\n",
    "    summaries = {}\n",
    "    for classValue, instances in separated.items():\n",
    "        summaries[classValue] = summarize(instances)\n",
    "    return summaries\n",
    "\n",
    "def calculateProbability(x, mean, stdev):\n",
    "    if (stdev == 0.0):\n",
    "        exponent = 0.0\n",
    "        retorno = 0.0\n",
    "    else:\n",
    "        exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))\n",
    "        retorno = (1 / (math.sqrt(2*math.pi) * stdev)) * exponent\n",
    "    \n",
    "    return retorno\n",
    "\n",
    "def calculateClassProbabilities(summaries, inputVector):\n",
    "    probabilities = {}\n",
    "    for classValue, classSummaries in summaries.items():\n",
    "        probabilities[classValue] = 1\n",
    "        for i in range(len(classSummaries)):\n",
    "            mean, stdev = classSummaries[i]\n",
    "            x = inputVector[i]\n",
    "            probabilities[classValue] *= calculateProbability(x, mean, stdev)\n",
    "    return probabilities\n",
    "\n",
    "def predict(summaries, inputVector):\n",
    "    probabilities = calculateClassProbabilities(summaries, inputVector)\n",
    "    bestLabel, bestProb = None, -1\n",
    "    for classValue, probability in probabilities.items():\n",
    "        if bestLabel is None or probability > bestProb:\n",
    "            bestProb = probability\n",
    "            bestLabel = classValue\n",
    "    return bestLabel\n",
    "\n",
    "def getPredictions(summaries, testSet):\n",
    "    predictions = []\n",
    "    for i in range(len(testSet)):\n",
    "        result = predict(summaries, testSet[i])\n",
    "        predictions.append(result)\n",
    "    return predictions\n",
    "\n",
    "def getAccuracy(testSet, predictions):\n",
    "    correct = 0\n",
    "    for i in range(len(testSet)):\n",
    "        if testSet[i][-1] == predictions[i]:\n",
    "            correct += 1\n",
    "    return (correct/float(len(testSet))) * 100.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = loadCsv('carData.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "j = 0;\n",
    "vhigh = 0;\n",
    "small = 0;\n",
    "low = 0;\n",
    "acc = 0;\n",
    "high = 0;\n",
    "big = 0;\n",
    "med = 0;\n",
    "unacc = 0;\n",
    "\n",
    "\n",
    "\n",
    "for i in range(7):\n",
    "    if ds[i][0] == 'vhigh':\n",
    "        vhigh = vhigh + 1;\n",
    "    if ds[i][0] == 'high':\n",
    "        high = high + 1;\n",
    "    if ds[i][0] == 'med':\n",
    "        med = med + 1;\n",
    "    if ds[i][0] == 'low':\n",
    "        low = low + 1;\n",
    "\n",
    "for i in range(7):\n",
    "    if ds[i][1] == 'vhigh':\n",
    "        vhigh = vhigh + 1;\n",
    "    if ds[i][1] == 'high':\n",
    "        high = high + 1;\n",
    "    if ds[i][1] == 'med':\n",
    "        med = med + 1;\n",
    "    if ds[i][1] == 'low':\n",
    "        low = low + 1;\n",
    "        ds[i][1] = low;\n",
    "\n",
    "    \n",
    "        \n",
    "        \n",
    "        \n",
    "for i in range(len(ds)):\n",
    "    for j in range(7):\n",
    "        if (ds[i][j] == 'low' or ds[i][j] == 'small' or ds[i][j] == 'unacc'):\n",
    "            ds[i][j] = 1.0\n",
    "        if (ds[i][j] == 'med' or ds[i][j] == 'acc' or ds[i][j] == '2'):\n",
    "            ds[i][j] = 2.0\n",
    "        if (ds[i][j] == 'high' or ds[i][j] == 'big' or ds[i][j] == 'good' or ds[i][j] == '3'):\n",
    "            ds[i][j] = 3.0\n",
    "        if (ds[i][j] == 'vhigh' or ds[i][j] == 'vgood' or ds[i][j] == '4'):\n",
    "            ds[i][j] = 4.0\n",
    "        if (ds[i][j] == 'more'):\n",
    "            ds[i][j] = 5.0\n",
    "        if (ds[i][j] == '5more'):\n",
    "            ds[i][j] = 6.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79.16666666666666"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train, test = splitDataset(ds, 0.75)\n",
    "summaries = summarizeByClass(train)\n",
    "predictions = getPredictions(summaries, test)\n",
    "accuracy = getAccuracy(test, predictions)\n",
    "\n",
    "accuracy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Questão 2\n",
    "Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "import pandas as pd\n",
    "from sklearn import datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>buying</th>\n",
       "      <th>maint</th>\n",
       "      <th>doors</th>\n",
       "      <th>persons</th>\n",
       "      <th>lug_boot</th>\n",
       "      <th>safety</th>\n",
       "      <th>classe</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   buying  maint  doors  persons  lug_boot  safety  classe\n",
       "0       3      3      0        0         2       1       2\n",
       "1       3      3      0        0         2       2       2\n",
       "2       3      3      0        0         2       0       2\n",
       "3       3      3      0        0         1       1       2\n",
       "4       3      3      0        0         1       2       2"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('carData.csv', names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classe'])\n",
    "\n",
    "df['buying'] = LabelEncoder().fit(df.buying).transform(df.buying)\n",
    "df['maint'] = LabelEncoder().fit(df.maint).transform(df.maint)\n",
    "df['doors'] = LabelEncoder().fit(df.doors).transform(df.doors)\n",
    "df['persons'] = LabelEncoder().fit(df.persons).transform(df.persons)\n",
    "df['lug_boot'] = LabelEncoder().fit(df.lug_boot).transform(df.lug_boot)\n",
    "df['safety'] = LabelEncoder().fit(df.safety).transform(df.safety)\n",
    "df['classe'] = LabelEncoder().fit(df.classe).transform(df.classe)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gnb = GaussianNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']\n",
    "X = df.get(features)\n",
    "Y = df['classe'].values\n",
    "X = MinMaxScaler().fit_transform(df[features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.64973730297723298"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state=1)\n",
    "y_pred = gnb.fit(X_train, Y_train).predict(X_test)\n",
    "accuracy_score(Y_test, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.12      0.62      0.21        24\n",
      "          1       0.00      0.00      0.00         0\n",
      "          2       0.83      0.85      0.84       396\n",
      "          3       1.00      0.14      0.24       151\n",
      "\n",
      "avg / total       0.85      0.65      0.65       571\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/374199/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/sklearn/metrics/classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.\n",
      "  'recall', 'true', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(y_pred, Y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
